{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "cf81e7bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from polyglot.detect import Detector\n",
    "from polyglot.text import Text\n",
    "from polyglot.detect.base import logger as polyglot_logger\n",
    "polyglot_logger.setLevel(\"ERROR\")\n",
    "\n",
    "import preprocessor as p\n",
    "p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION, p.OPT.SMILEY)\n",
    "\n",
    "import pyreadr\n",
    "from datetime import datetime\n",
    "import re\n",
    "\n",
    "from concurrent.futures import ProcessPoolExecutor\n",
    "from dataclasses import dataclass, field\n",
    "\n",
    "from tqdm.notebook import tqdm\n",
    "import timeout_decorator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "0e9356d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Path to the pickled DataFrame of scraped tweets that this notebook loads as its input.\n",
    "INPUT_FILE = \"tweets/20211007_scrape.pickle\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "930fc9cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the scraped tweets into a DataFrame.\n",
    "# NOTE: unpickling can execute arbitrary code, so only point INPUT_FILE at a file you trust.\n",
    "df = pd.read_pickle(INPUT_FILE)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6dc29c55",
   "metadata": {},
   "source": [
    "Clean the text in order to do language detection, and save it in field `text_lang`.\n",
    "\n",
    "1. If the tweet is a retweet (not a quoted retweet), `text_lang = referenced_tweet_text`, because a retweet's own `text` is just a truncated version of the referenced tweet.\n",
    "2. If the tweet is original, then `text_lang = text`\n",
    "3. If it's a reply, or a quote (or both), then `text_lang = text + referenced_tweet_text`, and run the algorithm on the whole thing."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "594a4646",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the text used for language detection.\n",
    "#\n",
    "# A missing `referenced_tweet_text` makes `text + ' ' + referenced_tweet_text` evaluate to NaN,\n",
    "# which would leave the tweet with no text at all. Treat missing text as empty so that a tweet\n",
    "# without a referenced tweet keeps its own `text`.\n",
    "# NOTE(review): assumes `referenced_tweet_text` is NaN for tweets that reference nothing - confirm.\n",
    "own_text = df['text'].fillna('')\n",
    "referenced_text = df['referenced_tweet_text'].fillna('')\n",
    "\n",
    "df['text_lang'] = np.where(\n",
    "    df['is_retweet'],\n",
    "    referenced_text,                                 # retweet: use the referenced tweet's text\n",
    "    (own_text + ' ' + referenced_text).str.strip(),  # original / reply / quote\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "872ae3f7",
   "metadata": {},
   "source": [
    "# Add entities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "bc6bb98d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_entities(blob):\n",
    "    \"\"\"Extract named entities from ``blob`` with polyglot.\n",
    "\n",
    "    The text is parsed with an Indonesian language hint (``'id'``).\n",
    "\n",
    "    Args:\n",
    "        blob: Text to analyse. Non-string values (such as NaN for a missing\n",
    "            text) and blank strings yield an empty list.\n",
    "\n",
    "    Returns:\n",
    "        A list of ``(tag, first_element)`` tuples, one per detected entity,\n",
    "        or an empty list when there is no usable text or extraction fails.\n",
    "    \"\"\"\n",
    "    # Values that are not text cannot be parsed; return early instead of\n",
    "    # letting the parser raise and printing one error line per row.\n",
    "    if not isinstance(blob, str) or not blob.strip():\n",
    "        return []\n",
    "\n",
    "    try:\n",
    "        text = Text(blob, hint_language_code='id')\n",
    "        # Only the first element of each entity is kept next to its tag.\n",
    "        return [(entity.tag, tuple(entity)[0]) for entity in text.entities]\n",
    "\n",
    "    except Exception as exc:\n",
    "        # Best effort: report the problem and carry on with the next text.\n",
    "        print(exc)\n",
    "        return []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d3c19d63",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "cebb9116c02c476f936f9bea3905dcc6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1595812 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Hand the texts to the worker processes in batches. With the default chunksize of 1\n",
    "# every single text is dispatched to a worker separately, which adds a lot of\n",
    "# inter-process overhead for an input of this size. The results and their order are unchanged.\n",
    "texts = df.text_lang.values\n",
    "with ProcessPoolExecutor() as executor:\n",
    "    entity_output = list(\n",
    "        tqdm(executor.map(get_entities, texts, chunksize=1000), total=len(texts))\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "c838977a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store each tweet's entity list in a new column; `entity_output` has one entry per row of `df`, in row order.\n",
    "df[\"entities\"] = entity_output"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c48c1645",
   "metadata": {},
   "source": [
    "# Add language cols"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "33586bb0",
   "metadata": {},
   "outputs": [],
   "source": [
    "@dataclass(init=False)\n",
    "class DetectableText:\n",
    "    \"\"\"A piece of text together with the language guesses made for it.\n",
    "\n",
    "    Attributes (set in ``__init__``):\n",
    "        text: The text that was analysed.\n",
    "        languages: The ``languages`` result of running ``Detector`` on ``text``.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, text):\n",
    "        self.text = text\n",
    "        detector = Detector(text, quiet=True)\n",
    "        self.languages = detector.languages\n",
    "\n",
    "    def get_lang_confidence(self, lang):\n",
    "        \"\"\"Return the detection confidence for the language named ``lang``.\n",
    "\n",
    "        Returns ``None`` when the text is shorter than 5 characters, the\n",
    "        confidence of the first detected language whose name equals ``lang``\n",
    "        otherwise, and ``0`` when no detected language matches.\n",
    "        \"\"\"\n",
    "        if len(self.text) < 5:\n",
    "            return None\n",
    "        matching = (guess.confidence for guess in self.languages if guess.name == lang)\n",
    "        return next(matching, 0)\n",
    "\n",
    "    def indo_conf(self):\n",
    "        \"\"\"Confidence that the text is Indonesian.\"\"\"\n",
    "        return self.get_lang_confidence(\"Indonesian\")\n",
    "\n",
    "    def malay_conf(self):\n",
    "        \"\"\"Confidence that the text is Malay.\"\"\"\n",
    "        return self.get_lang_confidence(\"Malay\")\n",
    "\n",
    "    def no_text(self):\n",
    "        \"\"\"Tell whether nothing remains once hashtags, newlines and spaces are removed.\"\"\"\n",
    "        leftover = re.sub(r'#(\\w)+|\\n| ', \"\", self.text)\n",
    "        return not leftover\n",
    "\n",
    "    def find_plm(self):\n",
    "        \"\"\"Tell whether the text contains ``#papuanlivesmatter``, ignoring case.\"\"\"\n",
    "        return \"#papuanlivesmatter\" in self.text.lower()\n",
    "\n",
    "    def get_tuple(self):\n",
    "        \"\"\"Return ``(text, Indonesian confidence, Malay confidence, no-text flag)``.\"\"\"\n",
    "        return (self.text, self.indo_conf(), self.malay_conf(), self.no_text())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "058a2fcd",
   "metadata": {},
   "outputs": [],
   "source": [
    "@timeout_decorator.timeout(5)\n",
    "def detect_lang(text):\n",
    "    \"\"\"Clean ``text`` and return its language-detection tuple.\n",
    "\n",
    "    The text is passed through ``p.clean`` and the cleaned result is wrapped\n",
    "    in ``DetectableText``, whose ``get_tuple()`` value is returned. The call\n",
    "    is wrapped by ``timeout_decorator.timeout(5)``.\n",
    "    \"\"\"\n",
    "    cleaned = p.clean(text)\n",
    "    detectable = DetectableText(cleaned)\n",
    "    return detectable.get_tuple()\n",
    "\n",
    "def process_text(text):\n",
    "    \"\"\"Run ``detect_lang`` on ``text`` without ever raising.\n",
    "\n",
    "    Args:\n",
    "        text: The text to analyse; may be a non-string value for a missing text.\n",
    "\n",
    "    Returns:\n",
    "        The tuple produced by ``detect_lang``, or a tuple of four NaNs when\n",
    "        ``text`` is not a string or when detection raises (the error is printed).\n",
    "    \"\"\"\n",
    "    failed = (np.nan, np.nan, np.nan, np.nan)\n",
    "\n",
    "    # Values that are not text can never be cleaned. Skip them quietly instead of\n",
    "    # printing one \"expected string or bytes-like object\" line per affected row.\n",
    "    if not isinstance(text, str):\n",
    "        return failed\n",
    "\n",
    "    try:\n",
    "        return detect_lang(text)\n",
    "    except Exception as exc:\n",
    "        # Best effort: report the problem and mark the row as failed.\n",
    "        print(exc)\n",
    "        return failed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "973603ac",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7c44c869494343ebb5e84dd38bfd75d0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1595812 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "expected string or bytes-like object\n",
      "expected string or bytes-like objectexpected string or bytes-like objectexpected string or bytes-like objectexpected string or bytes-like objectexpected string or bytes-like objectexpected string or bytes-like objectexpected string or bytes-like objectexpected string or bytes-like object\n",
      "\n",
      "\n",
      "expected string or bytes-like object\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "expected string or bytes-like object\n",
      "expected string or bytes-like objectexpected string or bytes-like objectexpected string or bytes-like objectexpected string or bytes-like objectexpected string or bytes-like object\n",
      "\n",
      "expected string or bytes-like objectexpected string or bytes-like object\n",
      "expected string or bytes-like object\n",
      "expected string or bytes-like object\n",
      "expected string or bytes-like objectexpected string or bytes-like object\n",
      "\n",
      "expected string or bytes-like object\n",
      "\n",
      "expected string or bytes-like object\n",
      "expected string or bytes-like object\n",
      "\n",
      "\n",
      "\n",
      "expected string or bytes-like objectexpected string or bytes-like objectexpected string or bytes-like object\n",
      "\n",
      "\n",
      "expected string or bytes-like objectexpected string or bytes-like objectexpected string or bytes-like objectexpected string or bytes-like object\n",
      "\n",
      "\n",
      "\n",
      "expected string or bytes-like object\n"
     ]
    }
   ],
   "source": [
    "# Hand the texts to the worker processes in batches. With the default chunksize of 1\n",
    "# every single text is dispatched to a worker separately, which adds a lot of\n",
    "# inter-process overhead for an input of this size. The results and their order are unchanged.\n",
    "texts = df.text_lang.values\n",
    "with ProcessPoolExecutor() as executor:\n",
    "    lang_output = list(\n",
    "        tqdm(executor.map(process_text, texts, chunksize=1000), total=len(texts))\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "b093f414",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Date stamp (YYYYMMDD) that is embedded in the names of the exported files.\n",
    "today = datetime.today().strftime(\"%Y%m%d\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "689bb5ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Unpack the per-tweet tuples from `lang_output` into columns. `text_lang` is replaced by the\n",
    "# cleaned text; rows whose processing failed hold NaN in all four columns.\n",
    "df[[\"text_lang\", \"indo_conf\", \"malay_conf\", \"no_text\"]] = lang_output"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b31ed2a4",
   "metadata": {},
   "source": [
    "# Export"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7488d5f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the full frame (including `entities`) as a pickle, with the index moved into a column.\n",
    "df.reset_index().to_pickle(f\"tweets/{today}_filter.pickle\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "82b5dc22",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Also export a gzip-compressed Rds file for use from R, without the `entities` column\n",
    "# (presumably because its list values cannot be stored in an R data frame - TODO confirm).\n",
    "# NOTE(review): this file is named `_filtered` while the pickle export uses `_filter`, and the\n",
    "# index is not reset here - confirm both are intended.\n",
    "pyreadr.write_rds(f\"tweets/{today}_filtered.Rds\", df.drop(columns=[\"entities\"]), compress=\"gzip\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
